In [1]:
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import pacf,acf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import numpy as np
import pandas as pd

from itertools import product

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
In [2]:
df = pd.read_csv(r'C:\Users\muhammad.usman38\Downloads\83163_1980_1_1_2020.csv')
df.head()
Out[2]:
COOPID year month day meanTemp
0 83163 1980 1 1 61.5
1 83163 1980 1 2 55.0
2 83163 1980 1 3 60.5
3 83163 1980 1 4 66.5
4 83163 1980 1 5 57.5
In [3]:
df.dtypes
Out[3]:
COOPID        int64
year          int64
month         int64
day           int64
meanTemp    float64
dtype: object
In [4]:
df['Calculated_Date'] = df[['year', 'month', 'day']].apply(lambda x: '{}-{}-{}'.format(x[0], x[1], x[2]), axis=1)

df['Calculated_Date'].head()
Out[4]:
0    1980-1-1
1    1980-1-2
2    1980-1-3
3    1980-1-4
4    1980-1-5
Name: Calculated_Date, dtype: object
In [5]:
df['Calculated_Date'] = pd.to_datetime(df['Calculated_Date'])
df['meanTemp'] = pd.to_numeric(df['meanTemp'])
df = df.drop(labels=['COOPID','year','month','day'], axis=1)
df.head()
Out[5]:
meanTemp Calculated_Date
0 61.5 1980-01-01
1 55.0 1980-01-02
2 60.5 1980-01-03
3 66.5 1980-01-04
4 57.5 1980-01-05
In [6]:
df.dtypes
Out[6]:
meanTemp                  float64
Calculated_Date    datetime64[ns]
dtype: object
In [7]:
import plotly.graph_objects as go

import pandas as pd



# Create figure
fig = go.Figure()

fig.add_trace(
    go.Scatter(x =df['Calculated_Date'],y = df['meanTemp'])

# # Set title
# fig.update_layout(
#     title_text="Time series with range slider and selectors"
 )

# Add range slider
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="backward"),
                dict(count=6,
                     label="6m",
                     step="month",
                     stepmode="backward"),
                dict(count=1,
                     label="YTD",
                     step="year",
                     stepmode="todate"),
                dict(count=1,
                     label="1y",
                     step="year",
                     stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)

fig.show()
In [11]:
plot_pacf(df['meanTemp']);
plot_acf(df['meanTemp']);
In [12]:
from statsmodels.tsa.stattools import pacf,acf
import plotly.graph_objects as go

df_pacf = pacf(df['meanTemp'], nlags=300)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x= np.arange(len(df_pacf)),
    y= df_pacf,
    name= 'PACF',
    ))
fig.update_xaxes(rangeslider_visible=True)
fig.update_layout(
    title="Partial Autocorrelation",
    xaxis_title="Lag",
    yaxis_title="Partial Autocorrelation",
         height=500,
    )
fig.show()
In [13]:
df_acf = acf(df['meanTemp'], nlags=300)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x= np.arange(len(df_acf)),
    y= df_acf,
    name= 'ACF',
    ))
fig.update_xaxes(rangeslider_visible=True)
fig.update_layout(
    title="Autocorrelation",
    xaxis_title="Lag",
    yaxis_title="Autocorrelation",
         height=500,
    )
fig.show()
In [14]:
ad_fuller_result = adfuller(df['meanTemp'])
print(f'ADF Statistic: {ad_fuller_result[0]}')
print(f'p-value: {ad_fuller_result[1]}')
ADF Statistic: -8.521928239248195
p-value: 1.0947585212702767e-13
In [15]:
plt.hist(df['meanTemp'])
plt.show()
In [16]:
import plotly.express as px

fig = px.histogram(df, x=df['meanTemp'])
fig.update_xaxes(rangeslider_visible=True)
fig.show()
In [17]:
def optimize_SARIMA(parameters_list, d, D, s, exog):
    
    results = []
    
    for param in tqdm_notebook(parameters_list):
        try: 
            model = SARIMAX(exog, order=(param[0], d, param[1]), seasonal_order=(param[2], D, param[3], s)).fit(disp=-1)
        except:
            continue
            
        aic = model.aic
        results.append([param, aic])
        
    result_df = pd.DataFrame(results)
    result_df.columns = ['(p,q)x(P,Q)', 'AIC']
    #Sort in ascending order, lower AIC is better
    result_df = result_df.sort_values(by='AIC', ascending=True).reset_index(drop=True)
    
    return result_df
In [18]:
p = range(0, 3, 1)
d = 1
q = range(0, 3, 1)
P = range(0, 3, 1)
D = 1
Q = range(0, 3, 1)
s = 2
parameters = product(p, q, P, Q)
parameters_list = list(parameters)
print(len(parameters_list))
81
In [19]:
result_df = optimize_SARIMA(parameters_list, 1, 1, 3, df['meanTemp'])
result_df
  0%|          | 0/81 [00:00<?, ?it/s]
Out[19]:
(p,q)x(P,Q) AIC
0 (1, 2, 0, 2) 73260.017704
1 (1, 2, 1, 1) 73260.341933
2 (1, 2, 2, 1) 73260.447554
3 (2, 2, 0, 2) 73261.427256
4 (2, 2, 2, 1) 73262.238920
... ... ...
76 (2, 0, 0, 0) 86281.622303
77 (1, 1, 0, 0) 87183.400934
78 (0, 1, 0, 0) 87184.345989
79 (1, 0, 0, 0) 87316.712295
80 (0, 0, 0, 0) 87641.338987

81 rows × 2 columns

In [20]:
result_df['AIC'].idxmin()
Out[20]:
0
In [21]:
result_df[0:1]
Out[21]:
(p,q)x(P,Q) AIC
0 (1, 2, 0, 2) 73260.017704
In [ ]:
 
In [22]:
best_model = SARIMAX(df['meanTemp'], order=(1, 1, 2), seasonal_order=(0, 1, 2, 12)).fit(dis=-1)
print(best_model.summary())
                                     SARIMAX Results                                      
==========================================================================================
Dep. Variable:                           meanTemp   No. Observations:                14530
Model:             SARIMAX(1, 1, 2)x(0, 1, 2, 12)   Log Likelihood              -36633.936
Date:                            Tue, 02 Aug 2022   AIC                          73279.872
Time:                                    22:09:06   BIC                          73325.371
Sample:                                         0   HQIC                         73294.992
                                          - 14530                                         
Covariance Type:                              opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.4739      0.011     43.680      0.000       0.453       0.495
ma.L1         -0.5642      0.011    -52.059      0.000      -0.585      -0.543
ma.L2         -0.3079      0.008    -39.839      0.000      -0.323      -0.293
ma.S.L12      -0.9904      0.009   -104.596      0.000      -1.009      -0.972
ma.S.L24      -0.0094      0.007     -1.395      0.163      -0.023       0.004
sigma2         9.0558      0.082    110.877      0.000       8.896       9.216
===================================================================================
Ljung-Box (L1) (Q):                   0.22   Jarque-Bera (JB):             17882.21
Prob(Q):                              0.64   Prob(JB):                         0.00
Heteroskedasticity (H):               0.82   Skew:                            -1.10
Prob(H) (two-sided):                  0.00   Kurtosis:                         7.97
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [33]:
best_model.plot_diagnostics(figsize=(18, 8))
plt.show()
In [37]:
df['arima_model'] = best_model.fittedvalues
df['arima_model'][0:1] = np.NaN
forecast = best_model.predict(start=df.shape[0], end=df.shape[0] + 180)
forecast = df['arima_model'].append(forecast)
In [38]:
df.to_csv(r'C:\Users\muhammad.usman38\Desktop\test.csv')
df
Out[38]:
meanTemp Calculated_Date arima_model
0 61.5 1980-01-01 NaN
1 55.0 1980-01-02 61.500009
2 60.5 1980-01-03 54.999833
3 66.5 1980-01-04 60.499959
4 57.5 1980-01-05 66.499952
... ... ... ...
14525 77.0 2019-12-28 74.753703
14526 77.0 2019-12-29 75.336646
14527 77.5 2019-12-30 75.472485
14528 74.0 2019-12-31 75.818053
14529 69.0 2020-01-01 72.812581

14530 rows × 3 columns

In [39]:
plt.figure(figsize=(15, 7.5))
plt.plot(forecast, color='r', label='model')
plt.plot(df['meanTemp'], label='actual')
plt.legend()
plt.show()
In [41]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=df["Calculated_Date"], y=df["meanTemp"], name="Actual", mode="lines"))
fig.add_trace(go.Scatter(x=df["Calculated_Date"], y=df["arima_model"], name="Forecast", mode="lines"))
fig.update_layout(
    title="Model reults", xaxis_title="Date", yaxis_title="Temp"
)
fig.update_xaxes(rangeslider_visible=True)
fig.show()